# NOTE(review): `rm(list = ls())` wipes the user's global environment as a side
# effect; style guides recommend restarting R for a clean session instead of
# clearing the workspace from inside a script.
# rm(list = ls())
# install.packages("tidytext")
library("tidytext")
# install.packages("tidyverse")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# install.packages("igraph")
library("igraph")
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# install.packages("reshape2")
library("reshape2")
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# install.packages("wordcloud")
library("wordcloud")
## Loading required package: RColorBrewer
# install.packages("ggraph")
library("ggraph")
# install.packages("widyr")
library("widyr")
# Read the cleaned text of the novel: one character string per physical line.
data <- readLines("Frank Herbert - Dune.txt_clean")
head(data)
## [1] "= = = = = = "
## [2] ""
## [3] "A beginning is the time for taking the most delicate care that the balances are correct. This every sister of the Bene Gesserit knows. To begin your study of the life of Muad'Dib, then, take care that you first place him in his time: born in the 57th year of the Padishah Emperor, Shaddam IV. And take the most special care that you locate Muad'Dib in his place: the planet Arrakis. Do not be deceived by the fact that he was born on Caladan and lived his first fifteen years there. Arrakis, the planet known as Dune, is forever his place."
## [4] "-from \"Manual of Muad'Dib\" by the Princess Irulan"
## [5] ""
## [6] " In the week before their departure to Arrakis, when all the final scurrying about had reached a nearly unbearable frenzy, an old crone came to visit the mother of the boy, Paul."
# `data_frame()` is deprecated in the tidyverse; `tibble()` is the supported
# constructor. `seq_along()` is safe even for zero-length input.
data_split <- tibble(line = seq_along(data), text = data)
head(data_split)
# data_split[data_split$line == "= = = = = =",]
# Chapters are separated by "= = = = = =" lines; a cumulative sum over the
# separator indicator yields a running chapter id (starting at 1).
data_split$chapitre <- cumsum(as.integer(data_split$text == "= = = = = =")) + 1
dim(data_split)
## [1] 8048 3
head(data_split)
# Data frame without the "= = = = = =" separator lines.
# Logical subsetting, unlike `-which(...)`, is safe when nothing matches.
data_chapter <- data_split[data_split$text != "= = = = = =", ]
# Number of lines per chapter.
table(data_chapter$chapitre)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 173 125 111 148 157 158 121 54 135 76 235 86 49 379 289 216 65 175
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 51 210 211 4 73 204 222 213 249 124 158 95 177 122 193 216 297 148
## 37 38 39 40 41 42 43 44 45 46 47 48
## 247 4 143 116 341 121 277 214 159 155 136 368
# Looking at the frequency table, chapters 22 and 38 contain only 4 lines.
# (These two sentences were bare prose in the original file — a syntax error —
# and are now comments.)
data_chapter[which(data_chapter$chapitre %in% c(22,38)),]
# These are in fact the lines separating the major parts of the book:
# Dune has 3 major parts: Book 1, Book 2 and Book 3.
# Retrieve the chapter ids that contain only 4 lines.
sep_partie <- as.integer(names(which(table(data_chapter$chapitre) == 4)))
# Add the "partie" (part) column: a running count of separator chapters seen.
data_chapter$partie <- cumsum(as.integer(data_chapter$chapitre %in% sep_partie)) + 1
# Drop the separator "chapters" themselves; the chapter ids are re-levelled
# below to account for the resulting gap in the numbering.
data_chapter <- data_chapter[!(data_chapter$chapitre %in% sep_partie), ]
# data_clean : 46 factor levels (after dropping the two part separators)
# data_brut  : 51 factor levels (raw chapter numbering)
# Re-level the chapter ids: the 46 remaining ids are renamed 1..46 and the
# extra levels 47..51 stay empty, matching the raw 51-chapter numbering.
# `levels<-` coerces its value to character anyway, so be explicit; use `<-`
# for assignment per R style.
data_chapter$chapitre <- as.factor(data_chapter$chapitre)
levels(data_chapter$chapitre) <- as.character(1:51)
data_chapter$partie <- as.factor(data_chapter$partie)
levels(data_chapter$partie) <- as.character(1:3)
message("Nombre de ligne par chapitre")
## Nombre de ligne par chapitre
table(data_chapter$chapitre)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 173 125 111 148 157 158 121 54 135 76 235 86 49 379 289 216 65 175
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 51 210 211 73 204 222 213 249 124 158 95 177 122 193 216 297 148 247
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## 143 116 341 121 277 214 159 155 136 368 0 0 0 0 0
message("Nombre de ligne par partie")
## Nombre de ligne par partie
table(data_chapter$partie)
##
## 1 2 3
## 3224 2738 2030
head(data_chapter)
# Drop empty lines (and any lingering separator); logical subsetting is safe
# even when there is no match, unlike `-which(...)`.
data_chapter <- data_chapter[!(data_chapter$text %in% c("= = = = = =", "")), ]
# Drop the first remaining row, then renumber lines consecutively.
data_chapter <- data_chapter[-1, ]
data_chapter$line <- seq_len(nrow(data_chapter))
head(data_chapter)
# Tokenise each line of the chapter data into one word per row.
data_part <- data_chapter %>%
  unnest_tokens(word, text)
# Reverse the factor levels so the parts facet in publication order.
data_part$partie <- factor(data_part$partie, levels = rev(1:3))
data_part
# Rolling sentiment per part: bucket words into windows of 500, join the Bing
# lexicon, and plot net sentiment (positive - negative) per window.
# `row_number()` replaces `1:n()`; `geom_col()` is the idiomatic form of
# `geom_bar(stat = "identity")`.
data_part %>%
  group_by(partie) %>%
  mutate(word_count = row_number(),
         index = word_count %/% 500 + 1) %>%
  inner_join(get_sentiments("bing")) %>%
  count(partie, index = index, sentiment) %>%
  ungroup() %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative,
         partie = factor(partie, levels = 1:3)) %>%
  ggplot(aes(index, sentiment, fill = partie)) +
  geom_col(alpha = 0.5, show.legend = FALSE) +
  facet_wrap(~ partie, ncol = 2, scales = "free_x")
## Joining, by = "word"
# Split each line into sentences. `str_split()` is vectorised, so all lines
# are split at once instead of growing `phrase` with `c()` inside a loop
# (which copies the whole vector on every iteration). `unlist()` flattens the
# per-line results in the same order the original loop produced.
phrase <- unlist(str_split(data_chapter$text, "\\. \"|\\.\"|[.][:space:]+(?=[:upper:])"))
# Strip residual quotes and periods.
phrase <- gsub("\\\"|[.]", "", phrase)
df_sentence <- data.frame(line = seq_along(phrase), sentence = phrase, stringsAsFactors = FALSE)
head(df_sentence)
# install.packages("sentimentr")
library(sentimentr)
# Flatten every line into individual sentences. `lapply()` + `unlist()`
# preserves order and avoids growing a vector with `c()` in a nested loop.
senti <- unlist(lapply(df_sentence$sentence, function(s) get_sentences(s)[[1]]))
df_sentr <- data.frame(senti = senti, stringsAsFactors = FALSE)
df_sentr$senti <- as.character(df_sentr$senti)
# Pass pre-split sentences to sentiment(), as its warning recommends: giving
# it a raw character vector would redo sentence-boundary disambiguation,
# which is costly in time and memory.
sentiment <- sentiment(get_sentences(df_sentr$senti))
df_sentr$sentiment <- as.numeric(sentiment$sentiment)
# Tag each sentence. The original had a fourth 'NA' branch that was
# unreachable: a numeric score is either zero, positive, or negative.
df_sentr$pntag <- ifelse(df_sentr$sentiment == 0, 'Neutral',
                         ifelse(df_sentr$sentiment > 0, 'Positive', 'Negative'))
# base R plot of the per-sentence sentiment trajectory
plot(df_sentr$sentiment, type='l', pch=3)
# plotly- more fun
# Plotly x-axis specification: hide tick labels and axis lines (one marker per
# sentence, so individual labels would be unreadable).
ax <- list(
title = "Phrase",
zeroline = FALSE,
showline = FALSE,
showticklabels = FALSE
)
# install.packages('plotly')
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:sentimentr':
##
## highlight
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Interactive scatter of sentiment score per sentence, coloured by the
# Positive / Neutral / Negative tag assigned above.
plot_ly(data = df_sentr, x = ~senti, y = ~sentiment, color = ~pntag,
type = 'scatter', mode = 'markers') %>% layout(xaxis = ax)
# Remove separator and empty lines from the raw vector. `data_split` was built
# from `data` line-for-line (same length), so its `text` column aligns with
# `data` by position. Logical subsetting replaces the fragile `-which(...)`.
data <- data[!(data_split$text %in% c("= = = = = =", ""))]
# Drop the leading line.
data <- data[-1]
# `data_frame()` is deprecated; `tibble()` is the supported constructor.
df_initial <- tibble(line = seq_along(data), text = data)
head(df_initial)
# One word per row, with stop words removed.
df <- df_initial %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
head(df)
# Horizontal bar chart of the most frequent tokens (count > 300).
df %>%
  count(word, sort = TRUE) %>%
  filter(n > 300) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab("Tokens") +
  coord_flip()
# Full token frequency table, most frequent first.
df %>%
  count(word, sort = TRUE)
# Comparison word cloud: positive vs. negative words per the Bing lexicon.
df %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 200, scale = c(3, .20),
                   colors = c("gray10", "gray50"))
## Joining, by = "word"
# Term frequency and rank for Zipf's-law analysis. `count()` already returns
# an ungrouped tibble, so the original `ungroup()` was a no-op and is dropped.
freq_by_rank <- df_initial %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(rank = row_number(),
         total = sum(n),
         `term frequency` = n / total)
dim(freq_by_rank)
## [1] 11475 5
head(freq_by_rank)
# Fit the power law on the mid-range ranks only (the head and tail of the
# distribution deviate from Zipf's law).
rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
##
## Coefficients:
## (Intercept) log10(rank)
## -0.8387 -1.0192
# Log-log plot with the fitted reference line. The intercept was mistyped as
# -0.837 in the original; the fit above gives -0.8387.
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`)) +
  geom_abline(intercept = -0.8387, slope = -1.0192, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
freq_by_rank %>%
  select(-total) %>%
  arrange(desc(`term frequency`))
# Tokenise `d` into n-grams, drop every n-gram that contains a stop word, and
# return the counts in decreasing order. Generalises the four copy-pasted
# pipelines for n = 2..5 into a single helper, using only verbs available in
# the dplyr 0.7 generation this script was written against (no `across()`).
count_clean_ngrams <- function(d, n) {
  parts <- paste0("word", seq_len(n))
  tokens <- d %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    separate(ngram, parts, sep = " ") %>%
    na.omit()
  # keep only n-grams in which every word is outside the stop-word list
  for (p in parts) {
    tokens <- tokens[!tokens[[p]] %in% stop_words$word, ]
  }
  tokens %>%
    group_by_at(parts) %>%
    summarise(n = n()) %>%
    ungroup() %>%
    arrange(desc(n))
}
bigrams2 <- count_clean_ngrams(df_initial, 2)
trigrams3 <- count_clean_ngrams(df_initial, 3)
quadrigrams4 <- count_clean_ngrams(df_initial, 4)
quintigrams5 <- count_clean_ngrams(df_initial, 5)
head(bigrams2)
head(trigrams3)
head(quadrigrams4)
head(quintigrams5)
# Collapse the word columns back into a single space-separated string. The
# result column is named "bigram" for every n, matching the original objects
# so downstream code is unaffected.
bigrams2_filtered <- bigrams2 %>%
  unite(bigram, word1, word2, sep = " ")
trigrams3_filtered <- trigrams3 %>%
  unite(bigram, word1, word2, word3, sep = " ")
quadrigrams4_filtered <- quadrigrams4 %>%
  unite(bigram, word1, word2, word3, word4, sep = " ")
quintigrams5_filtered <- quintigrams5 %>%
  unite(bigram, word1, word2, word3, word4, word5, sep = " ")
bigrams2_filtered
trigrams3_filtered
quadrigrams4_filtered
quintigrams5_filtered
# Network of bigrams occurring more than 10 times.
bigram_graph <- bigrams2 %>%
  filter(n > 10) %>%
  graph_from_data_frame()
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# Same network restricted to bigrams occurring more than 20 times, drawn with
# directed arrows and edge transparency proportional to the count.
edge_arrow <- grid::arrow(type = "closed", length = unit(.15, "inches"))
bigram_graph <- bigrams2 %>%
  filter(n > 20) %>%
  graph_from_data_frame()
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = edge_arrow, end_cap = circle(.07, 'inches')) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
a <- edge_arrow
# Cut the book into 10-line sections and keep the non-stop-word tokens of each.
section_words <- df_initial %>%
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  unnest_tokens(word, text) %>%
  filter(!(word %in% stop_words$word))
# How often does each pair of words co-occur within a section?
word_pairs <- section_words %>%
  pairwise_count(word, section, sort = TRUE)
word_pairs
# Pairwise correlation (phi coefficient) between words that appear at least
# 50 times.
word_cors <- section_words %>%
  group_by(word) %>%
  filter(n() >= 50) %>%
  pairwise_cor(word, section, sort = TRUE)
dim(word_cors)
## [1] 47306 3
# Graph of strongly correlated (> .20) word pairs.
word_cors %>%
  filter(correlation > .20) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(size = 5, color = "lightblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
# # Load library
# library(cleanNLP)
#
# library(reticulate)
#
# # Setting up NLP backend
# cnlp_init_spacy()
#
# # Get text book 1
# book1 <- paste(data_chapter$text[which(data_chapter$partie==1)], collapse = " ")
#
# obj1 <- cnlp_annotate(book1, as_strings = TRUE)
#
# head(obj1)
#
# # Find the named entities in our text
# people <- cnlp_get_entity(obj1) %>%
# filter(entity_type == "PERSON") %>%
# group_by(entity) %>%
# count %>%
# arrange(desc(n))
#
# # Show the top 20 characters by mention
# people
# DATA PREPARATION ####
# Factions of the Dune universe to track; "Miscellaneous" is a catch-all.
group_data <- c("House Atreides","House Harkonnen","Bene Gesserit","Bene Tleilax","Spacing Guild",
"Honored Matres" ,"Fremen","Miscellaneous" )
regex_groupe <- paste(group_data, collapse = "|") # regular expression matching any faction name
# Drop empty / whitespace-only sentences. Logical subsetting is safe when
# nothing matches, unlike `-which(...)`; `seq_len()` is safe for zero rows.
df_sentence <- df_sentence[!(df_sentence$sentence %in% c("", " ")), ]
df_sentence$line <- seq_len(nrow(df_sentence))
# LOAD IN BOOK TEXT
# Keep sentences that mention at least one faction and append one logical
# column per faction flagging the reference. The `.` inside sapply() is the
# magrittr pronoun for the filtered data frame piped into cbind().
groupes_sentences <- df_sentence %>%
  filter(grepl(regex_groupe, sentence)) %>% # exclude sentences without a faction reference
  cbind(sapply(group_data, function(x) grepl(x, .$sentence))) # identify references
# examine (truncate long sentences for display only)
max.char <- 35 # max sentence length shown when printing
groupes_sentences %>%
  mutate(sentence = ifelse(nchar(sentence) > max.char,
                           paste0(substring(sentence, 1, max.char), "..."),
                           sentence)) %>%
  head(5)
# Capitalize the first character of each string in `text` (vectorised over
# `text`; an empty string is returned unchanged). Rewritten in plain base R so
# the function no longer depends on the magrittr pipe being attached.
Capitalize <- function(text) {
  paste0(toupper(substring(text, 1, 1)), substring(text, 2))
}
# TO LONG FORMAT
# One row per (sentence, faction) pair: gather the per-faction logical columns
# into key/value form, keep only TRUE rows, then drop the helper columns.
groupe_long <- groupes_sentences %>%
gather(key = group_data, value = test, -sentence, -line) %>%
mutate(group = group_data) %>% # copy key column to `group`. NOTE(review): the Capitalize() helper defined above is never applied here — confirm whether capitalisation was intended
filter(test) %>% select(-c(group_data,test)) # delete rows where the faction is not referenced
# examine (truncation below is for display only)
groupe_long %>%
mutate(sentence = ifelse(nchar(sentence) > max.char, # cut off long sentences
paste0(substring(sentence, 1, max.char), "..."),
sentence)) %>%
head(20)
# set plot width & height
# NOTE(review): `w` and `h` are not used by the code below — presumably knitr
# chunk dimensions from the original notebook; confirm before removing.
w = 19; h = 6
# PLOT REFERENCE FREQUENCY
# Bar chart of the number of sentences referencing each faction, with the
# faction name printed past the end of each bar and the y axis hidden.
groupe_long %>%
group_by(group) %>%
summarize(n = n()) %>% # count sentences per house
ggplot(aes(x = desc(group), y = n)) + # desc() maps groups to negative ranks so bars appear in reverse order after coord_flip()
geom_bar(aes(fill = group), stat = 'identity')+
geom_text(aes(y = n+50, label = group),position = position_dodge(0.9),
vjust = 0) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = 'none') +
coord_flip()
# IDENTIFY WORDS USED IN COMBINATION WITH HOUSES
# Count how often each word occurs in sentences referencing each faction.
# Note: summarize() drops only the innermost grouping level, so the result is
# still grouped by `group` — later code relies on re-grouping explicitly.
words_by_groupes <- groupe_long %>%
unnest_tokens(word, sentence, token = 'words') %>% # retrieve words
mutate(word = gsub("'s", "", word)) %>% # remove possessive determiners
group_by(group, word) %>%
summarize(word_n = n()) # count words per house
# examine
words_by_groupes %>% head()
# custom functions for reordering facet plots
# https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R
# Append the facet value to each item (separated by `sep`) so identical labels
# in different facets become distinct factor levels, then order those levels
# by `fun` applied to `by`. Companion to scale_x_reordered(), which strips the
# suffix back off the axis labels.
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  reorder(paste(x, within, sep = sep), by, FUN = fun)
}
# Discrete x scale that strips the `sep` suffix added by reorder_within()
# from the axis labels before display.
scale_x_reordered <- function(..., sep = "___") {
  strip_suffix <- function(labels) gsub(paste0(sep, ".+$"), "", labels)
  ggplot2::scale_x_discrete(labels = strip_suffix, ...)
}
# set plot width & height
w <- 10; h <- 7
# PLOT MOST FREQUENT WORDS PER HOUSE
words_per_groupe <- 20 # number of top words to keep per faction
words_by_groupes %>%
  group_by(group) %>%
  arrange(group, desc(word_n)) %>%
  mutate(top = row_number()) %>%         # rank of each word within its faction
  filter(top <= words_per_groupe) %>%    # retain the specified top number
  ggplot(aes(reorder_within(word, -top, group), # reorder by negative rank
             word_n, fill = group)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~ group, scales = "free_y") + # facet wrap with a free y axis
  coord_flip()
# Score how distinctive each word is for each faction: usage with the faction
# relative to usage everywhere else, adjusted for how many word rows the
# faction contributes overall. The regroup/ungroup order matters: `n()` in the
# final mutate runs after ungroup() and therefore counts ALL rows.
words_by_groupes <- words_by_groupes %>%
group_by(word) %>% mutate(word_sum = sum(word_n)) %>% # counts words overall
group_by(group) %>% mutate(group_n = n()) %>%
ungroup() %>%
# compute ratio of usage in combination with house as opposed to overall
# adjusted for house references frequency as opposed to overall frequency
# (the "+ 1" avoids division by zero for words used only with this faction)
mutate(ratio = (word_n / (word_sum - word_n + 1) / (group_n / n())))
# examine
words_by_groupes %>% select(-word_sum, -group_n) %>% arrange(desc(word_n)) %>% head()
# Plot the most faction-distinctive words (highest ratio) per faction.
words_by_groupes %>%
  group_by(group) %>%
  arrange(group, desc(ratio)) %>%
  mutate(top = row_number()) %>%         # rank of each word within its faction
  filter(top <= words_per_groupe) %>%    # retain the specified top number
  ggplot(aes(reorder_within(word, -top, group), # reorder by negative rank
             ratio, fill = group)) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~ group, scales = "free") + # facet wrap with fully free scales
  coord_flip()